# clear your environment
rm(list = ls(all=TRUE))

Reading & Understanding the Data

  • Make sure the dataset is located in your current working directory
input_data = read.csv('trainDF_for_Clustering_21-15-35.csv', header = T)
  • Use the str(), summary() functions to get a feel for the dataset.
str(input_data)
## 'data.frame':    11071 obs. of  20 variables:
##  $ CustomerID                          : Factor w/ 11071 levels "C12116","C12117",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ ProfessionalLicensure               : int  0 1 0 0 1 0 0 1 0 1 ...
##  $ eCommerceAccount                    : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ SocialMediaAccount                  : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 1 2 2 1 ...
##  $ NoOfProperties                      : int  2 1 1 1 1 1 4 1 1 2 ...
##  $ CreditRiskScore                     : int  711 767 827 795 827 738 670 674 560 744 ...
##  $ age_in_years                        : int  50 38 49 43 44 40 37 36 37 40 ...
##  $ Total_no_of_loan_tenure_with_current: int  24 10 35 35 35 27 27 27 11 27 ...
##  $ Asset_type                          : Factor w/ 2 levels "Complex With shared services",..: 2 2 2 1 2 2 2 2 2 2 ...
##  $ Villa_House                         : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 1 2 2 2 2 ...
##  $ Investment_SelfOccupied             : Factor w/ 2 levels "Investment","Self Occupancy": 2 2 2 2 2 2 1 2 2 2 ...
##  $ Payment_Status                      : Factor w/ 3 levels "Default","Non-Payoff/Non-Default",..: 1 1 2 2 2 2 2 2 1 2 ...
##  $ Remaining_loan_tenure               : int  65 104 45 46 47 80 80 80 98 81 ...
##  $ ratio_RE_inflation                  : num  2.03 1.1 2.27 2.25 2.26 ...
##  $ ratio_Loan_to_Appraisedvalu_Percent : num  0.379 0.894 0.397 0.396 0.371 ...
##  $ ratio_Interestrate                  : num  1 1.292 1.001 1 0.922 ...
##  $ ratio_outstanding                   : num  0.765 0.979 0.887 0.88 0.826 ...
##  $ diff_Salary_Utility                 : num  745 1155 1137 837 668 ...
##  $ ratio_curr_outstanding_Salary       : num  23.5 98 23.2 22 40.1 ...
##  $ Avg_transaction_reqd                : num  30 16.7 44.7 46.1 47.5 ...
summary(input_data)
##    CustomerID    ProfessionalLicensure eCommerceAccount SocialMediaAccount
##  C12116 :    1   Min.   :0.0000        Min.   :0.000    No :3465          
##  C12117 :    1   1st Qu.:0.0000        1st Qu.:0.000    Yes:7606          
##  C12118 :    1   Median :1.0000        Median :1.000                      
##  C12119 :    1   Mean   :0.5722        Mean   :0.523                      
##  C12120 :    1   3rd Qu.:1.0000        3rd Qu.:1.000                      
##  C12121 :    1   Max.   :1.0000        Max.   :1.000                      
##  (Other):11065                                                            
##  NoOfProperties  CreditRiskScore  age_in_years  
##  Min.   :1.000   Min.   :350.0   Min.   :23.00  
##  1st Qu.:1.000   1st Qu.:497.0   1st Qu.:37.00  
##  Median :1.000   Median :641.0   Median :38.00  
##  Mean   :1.429   Mean   :627.2   Mean   :39.04  
##  3rd Qu.:2.000   3rd Qu.:777.0   3rd Qu.:40.00  
##  Max.   :4.000   Max.   :830.0   Max.   :70.00  
##                                                 
##  Total_no_of_loan_tenure_with_current                        Asset_type   
##  Min.   :10.00                        Complex With shared services:  757  
##  1st Qu.:14.00                        No shred services           :10314  
##  Median :23.00                                                            
##  Mean   :24.34                                                            
##  3rd Qu.:33.00                                                            
##  Max.   :51.00                                                            
##                                                                           
##  Villa_House   Investment_SelfOccupied                Payment_Status
##  No :4199    Investment    :1670       Default               :3327  
##  Yes:6872    Self Occupancy:9401       Non-Payoff/Non-Default:3543  
##                                        Payoff                :4201  
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##  Remaining_loan_tenure ratio_RE_inflation
##  Min.   :  1.00        Min.   :0.6965    
##  1st Qu.: 82.00        1st Qu.:0.7783    
##  Median : 92.00        Median :0.8610    
##  Mean   : 88.84        Mean   :0.9714    
##  3rd Qu.:104.00        3rd Qu.:1.0925    
##  Max.   :150.00        Max.   :2.8494    
##                                          
##  ratio_Loan_to_Appraisedvalu_Percent ratio_Interestrate ratio_outstanding
##  Min.   :0.05706                     Min.   :0.2247     Min.   :0.1510   
##  1st Qu.:0.86134                     1st Qu.:1.0000     1st Qu.:0.9341   
##  Median :1.12380                     Median :1.0000     Median :0.9691   
##  Mean   :1.05316                     Mean   :1.2105     Mean   :0.9465   
##  3rd Qu.:1.26403                     3rd Qu.:1.0508     3rd Qu.:0.9872   
##  Max.   :1.54046                     Max.   :8.0250     Max.   :1.1452   
##                                                                          
##  diff_Salary_Utility ratio_curr_outstanding_Salary Avg_transaction_reqd
##  Min.   :  515.2     Min.   :   0.00               Min.   :-496.50     
##  1st Qu.: 1051.9     1st Qu.:  45.40               1st Qu.:  15.00     
##  Median : 1505.1     Median :  72.31               Median :  25.00     
##  Mean   : 3729.7     Mean   :  89.51               Mean   :  26.29     
##  3rd Qu.: 3686.5     3rd Qu.: 103.57               3rd Qu.:  34.47     
##  Max.   :99612.8     Max.   :2084.67               Max.   :4110.60     
## 

Data Pre-processing

  • Attribute ‘ProfessionalLicensure’, ‘eCommerceAccount’ and ‘NoOfProperties’ are categorical variables. Lets convert appropriately.
input_data$ProfessionalLicensure = as.factor(as.character(input_data$ProfessionalLicensure))

input_data$eCommerceAccount = as.factor(as.character(input_data$eCommerceAccount))
input_data$NoOfProperties = as.factor(as.character(input_data$NoOfProperties))

#Now see the structure of the dataframe
str(input_data)
## 'data.frame':    11071 obs. of  20 variables:
##  $ CustomerID                          : Factor w/ 11071 levels "C12116","C12117",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ ProfessionalLicensure               : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 2 1 2 ...
##  $ eCommerceAccount                    : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 1 2 ...
##  $ SocialMediaAccount                  : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 1 2 2 1 ...
##  $ NoOfProperties                      : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 1 1 4 1 1 2 ...
##  $ CreditRiskScore                     : int  711 767 827 795 827 738 670 674 560 744 ...
##  $ age_in_years                        : int  50 38 49 43 44 40 37 36 37 40 ...
##  $ Total_no_of_loan_tenure_with_current: int  24 10 35 35 35 27 27 27 11 27 ...
##  $ Asset_type                          : Factor w/ 2 levels "Complex With shared services",..: 2 2 2 1 2 2 2 2 2 2 ...
##  $ Villa_House                         : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 1 2 2 2 2 ...
##  $ Investment_SelfOccupied             : Factor w/ 2 levels "Investment","Self Occupancy": 2 2 2 2 2 2 1 2 2 2 ...
##  $ Payment_Status                      : Factor w/ 3 levels "Default","Non-Payoff/Non-Default",..: 1 1 2 2 2 2 2 2 1 2 ...
##  $ Remaining_loan_tenure               : int  65 104 45 46 47 80 80 80 98 81 ...
##  $ ratio_RE_inflation                  : num  2.03 1.1 2.27 2.25 2.26 ...
##  $ ratio_Loan_to_Appraisedvalu_Percent : num  0.379 0.894 0.397 0.396 0.371 ...
##  $ ratio_Interestrate                  : num  1 1.292 1.001 1 0.922 ...
##  $ ratio_outstanding                   : num  0.765 0.979 0.887 0.88 0.826 ...
##  $ diff_Salary_Utility                 : num  745 1155 1137 837 668 ...
##  $ ratio_curr_outstanding_Salary       : num  23.5 98 23.2 22 40.1 ...
##  $ Avg_transaction_reqd                : num  30 16.7 44.7 46.1 47.5 ...
  • Identify the categorical and numerical attributes appropriately
# extract the names of categorical columns
categorical_cols = c()

for (coln in names(input_data)){
  if (is.factor(input_data[[coln]])){
    categorical_cols = append(categorical_cols,coln)
  }
}

categorical_cols = setdiff(categorical_cols,"CustomerID")
print(categorical_cols)
## [1] "ProfessionalLicensure"   "eCommerceAccount"       
## [3] "SocialMediaAccount"      "NoOfProperties"         
## [5] "Asset_type"              "Villa_House"            
## [7] "Investment_SelfOccupied" "Payment_Status"
#Now, how to find the numerical attributes?
numerical_cols = setdiff(colnames(input_data), categorical_cols)

numerical_cols = setdiff(numerical_cols,"CustomerID")
numerical_cols
##  [1] "CreditRiskScore"                     
##  [2] "age_in_years"                        
##  [3] "Total_no_of_loan_tenure_with_current"
##  [4] "Remaining_loan_tenure"               
##  [5] "ratio_RE_inflation"                  
##  [6] "ratio_Loan_to_Appraisedvalu_Percent" 
##  [7] "ratio_Interestrate"                  
##  [8] "ratio_outstanding"                   
##  [9] "diff_Salary_Utility"                 
## [10] "ratio_curr_outstanding_Salary"       
## [11] "Avg_transaction_reqd"
  • Convert the Customer IDs to the row names, as this will later help us in visualising the clusters
rownames(input_data) <- input_data$CustomerID
customerID = input_data$CustomerID
  • Drop the CustomerID column as it is now just redundant information
input_data$CustomerID = NULL
# OR
# input_data <- input_data[, -c(colnames(input_data) %in% ("name"))]
  • Find the number of missing values and either impute or omit them
sum(is.na(input_data))
## [1] 0
  • There are no missing values in dataset
# check no missing value in categorical column
# sum(is.na(input_data$shelf))
# had there been any missing value we would have imputed it seperately from numerical using modes.
# library(DMwR)
# input_data[,numerical_cols] <- knnImputation(input_data[,numerical_cols], k = 3, scale = T)
# 
# sum(is.na(input_data))
# Make a copy of the dataframe for later use (mixed attributes)
input_data_ori = input_data
  • Convert the categorical to dummy variables (converting to numeric attributes by using dummy)
# library("dummies")
# shelfDummies = data.frame(dummy(input_data$shelf))
# 
# # name the new attributes appropriately
# names(shelfDummies) = c("Shelf1","Shelf2","Shelf3")
# head(shelfDummies)
# 
# library("dummies")
# 
# dummifiedVarDF = list()
# for (i in 1:length(categorical_cols)){
#   dummifiedVarDF[[i]] = data.frame(dummy(input_data[[categorical_cols[i]]]))
#   names(dummifiedVarDF[[i]]) = levels(input_data[[categorical_cols[i]]])
#   input_data[[categorical_cols[i]]] = NULL
#   input_data = data.frame(cbind(input_data,dummifiedVarDF[[i]]))
# }
# 
# # print(dummifiedVarDF[1])
# head(input_data)
  • Converting categorical features to numeric features by using dummy _ ProfessionalLicensure _
library("dummies")
## dummies-1.5.6 provided by Decision Patterns
ProfessionalLicensure_Dummies = data.frame(dummy(input_data$ProfessionalLicensure))
# name the new attributes appropriately
names(ProfessionalLicensure_Dummies) = c("ProfessionalLicensure0","ProfessionalLicensure1")
# head(ProfessionalLicensure_Dummies)

_ eCommerceAccount _

eCommerceAccount_Dummies = data.frame(dummy(input_data$eCommerceAccount))
# name the new attributes appropriately
names(eCommerceAccount_Dummies) = c("eCommerceAccount0","eCommerceAccount1")
# head(eCommerceAccount_Dummies)

_ SocialMediaAccount _

SocialMediaAccount_Dummies = data.frame(dummy(input_data$SocialMediaAccount))
# naming the new attributes appropriately
names(SocialMediaAccount_Dummies) = c("SocialMediaAccount_No","SocialMediaAccount_Yes")
# head(SocialMediaAccount_Dummies)

_ NoOfProperties _

NoOfProperties_Dummies = data.frame(dummy(input_data$NoOfProperties))
# naming the new attributes appropriately
names(NoOfProperties_Dummies) = c("NoOfProperties_1","NoOfProperties_2", "NoOfProperties_3","NoOfProperties_4")
# head(NoOfProperties_Dummies)

__ Asset_type __

Asset_type_Dummies = data.frame(dummy(input_data$Asset_type))
# naming the new attributes appropriately
names(Asset_type_Dummies) = c("Asset_type_Complex_With_shared_services", "Asset_type_No_shred_services")
# head(Asset_type_Dummies)

__ Villa_House __

Villa_House_Dummies = data.frame(dummy(input_data$Villa_House))
# naming the new attributes appropriately
names(Villa_House_Dummies) = c("Villa_House_No","Villa_House_Yes")
# head(Villa_House_Dummies)

__ Investment_SelfOccupied __

Investment_SelfOccupied_Dummies = data.frame(dummy(input_data$Investment_SelfOccupied))
# naming the new attributes appropriately
names(Investment_SelfOccupied_Dummies) = c("Investment_SelfOccupied_Investment", "Investment_SelfOccupied_Self_Occupancy")
# head(Investment_SelfOccupied_Dummies)

__ Payment_Status __

Payment_Status_Dummies = data.frame(dummy(input_data$Payment_Status))
# naming the new attributes appropriately
names(Payment_Status_Dummies) = c("Payment_Status_Default", "Payment_Status_Non-Payoff/Non-Default", "Payment_Status_Payoff")
# head(Payment_Status_Dummies)
  • Removing the original categorical features and adding the newly created dummy features
input_data$ProfessionalLicensure = NULL
input_data$eCommerceAccount = NULL
input_data$SocialMediaAccount = NULL
input_data$NoOfProperties = NULL
input_data$Asset_type = NULL
input_data$Villa_House = NULL
input_data$Investment_SelfOccupied = NULL
input_data$Payment_Status = NULL


input_data = data.frame(cbind(input_data, ProfessionalLicensure_Dummies, eCommerceAccount_Dummies, SocialMediaAccount_Dummies, NoOfProperties_Dummies, Asset_type_Dummies, Villa_House_Dummies, Investment_SelfOccupied_Dummies, Payment_Status_Dummies))
  • The data must be scaled, before measuring any type of distance metric as the variables with higher ranges will significantly influence the distance
input_data[, numerical_cols] =  scale(input_data[,numerical_cols], center = T, scale = T)

Partitioning clustering:

In the partitioning approach, you specify K: the number of clusters sought. Observations are then randomly divided into K groups and reshufled to form cohesive clusters.

  • Popular algorithms
    • k-means
    • Partioning around medoids (PAM)

K-Means Clustering

K-Means Clustering procedure

  • Let us first try to animate the kmeans process. Build a basic kmeans model with k = 3, using the kmeans.ani() function. This takes an input of size features. Hence considering “diff_Salary_Utility” and “CreditRiskScore”
library(animation)
## Warning: package 'animation' was built under R version 3.5.2
set.seed(123)
km_ani <- kmeans.ani(subset(input_data, select=c("diff_Salary_Utility", "CreditRiskScore")), centers = 3)

As we can see, it needed few iterations to find final clusters

  • Build a basic kmeans model with k = 3, using the kmeans() function
set.seed(123)
km_basic <- kmeans(input_data, centers = 3, nstart = 4)
  • The kmeans() function returns a list of 9 objects which include the cluster assignments (“cluster”), cluster centers (“centers”), etc. You can further explore the returned object by calling the str() function on the returned object and going through the documentation
# plot in 2-d
str(km_basic)
## List of 9
##  $ cluster     : Named int [1:11071] 2 3 2 2 2 3 3 3 1 3 ...
##   ..- attr(*, "names")= chr [1:11071] "C12116" "C12117" "C12118" "C12119" ...
##  $ centers     : num [1:3, 1:30] -0.906 1.131 0.719 -0.325 2.034 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:30] "CreditRiskScore" "age_in_years" "Total_no_of_loan_tenure_with_current" "Remaining_loan_tenure" ...
##  $ totss       : num 159124
##  $ withinss    : num [1:3] 47566 26307 45177
##  $ tot.withinss: num 119051
##  $ betweenss   : num 40073
##  $ size        : int [1:3] 5167 1063 4841
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
  • We can use fviz_cluster() to see the segmentation in 2-d. This function implicitly does PCA and uses the 2 Principal components for plotting.
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.2
## Loading required package: ggplot2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# plot in 2-d
fviz_cluster(km_basic, input_data, labelsize = 7)

  • Let’s now build a sceen plot to choose a “k”
set.seed(123)
fviz_nbclust(input_data, kmeans, method = "wss")

  • Let’s choose k as 3 and cluster the data
set.seed(123)
km_clust <- kmeans(input_data, centers = 3, nstart = 4)

# Store the cluster assignments in a new data frame
input_clusts_km <- as.data.frame(cbind(clust = km_clust$cluster, input_data))

# Look at the head of the data
head(input_clusts_km)
##        clust CreditRiskScore age_in_years
## C12116     2       0.5662109    2.6969057
## C12117     3       0.9444832   -0.2559390
## C12118     2       1.3497751    2.4508353
## C12119     2       1.1336194    0.9744130
## C12120     2       1.3497751    1.2204833
## C12121     3       0.7485922    0.2362018
##        Total_no_of_loan_tenure_with_current Remaining_loan_tenure
## C12116                          -0.03227037            -1.0423143
## C12117                          -1.36712851             0.6626760
## C12118                           1.01654674            -1.9166683
## C12119                           1.01654674            -1.8729506
## C12120                           1.01654674            -1.8292329
## C12121                           0.25377066            -0.3865488
##        ratio_RE_inflation ratio_Loan_to_Appraisedvalu_Percent
## C12116          3.8164550                          -2.6254003
## C12117          0.4794188                          -0.6191181
## C12118          4.6861716                          -2.5561725
## C12119          4.6245219                          -2.5583645
## C12120          4.6505310                          -2.6586567
## C12121         -0.8634853                           0.8826714
##        ratio_Interestrate ratio_outstanding diff_Salary_Utility
## C12116        -0.23521295        -1.9582842          -0.4181081
## C12117         0.09080138         0.3542952          -0.3606889
## C12118        -0.23429193        -0.6412755          -0.3631720
## C12119        -0.23521295        -0.7240622          -0.4052008
## C12120        -0.32280141        -1.3030235          -0.4289440
## C12121        -0.13753936        -0.1880079          -0.3571597
##        ratio_curr_outstanding_Salary Avg_transaction_reqd
## C12116                    -0.7814550           0.08201288
## C12117                     0.1005737          -0.21563142
## C12118                    -0.7848104           0.41180602
## C12119                    -0.7986421           0.44235934
## C12120                    -0.5845045           0.47539331
## C12121                    -0.1667940           0.14422126
##        ProfessionalLicensure0 ProfessionalLicensure1 eCommerceAccount0
## C12116                      1                      0                 0
## C12117                      0                      1                 0
## C12118                      1                      0                 0
## C12119                      1                      0                 0
## C12120                      0                      1                 0
## C12121                      1                      0                 0
##        eCommerceAccount1 SocialMediaAccount_No SocialMediaAccount_Yes
## C12116                 1                     1                      0
## C12117                 1                     0                      1
## C12118                 1                     0                      1
## C12119                 1                     1                      0
## C12120                 1                     1                      0
## C12121                 1                     1                      0
##        NoOfProperties_1 NoOfProperties_2 NoOfProperties_3 NoOfProperties_4
## C12116                0                1                0                0
## C12117                1                0                0                0
## C12118                1                0                0                0
## C12119                1                0                0                0
## C12120                1                0                0                0
## C12121                1                0                0                0
##        Asset_type_Complex_With_shared_services
## C12116                                       0
## C12117                                       0
## C12118                                       0
## C12119                                       1
## C12120                                       0
## C12121                                       0
##        Asset_type_No_shred_services Villa_House_No Villa_House_Yes
## C12116                            1              0               1
## C12117                            1              0               1
## C12118                            1              0               1
## C12119                            0              1               0
## C12120                            1              0               1
## C12121                            1              1               0
##        Investment_SelfOccupied_Investment
## C12116                                  0
## C12117                                  0
## C12118                                  0
## C12119                                  0
## C12120                                  0
## C12121                                  0
##        Investment_SelfOccupied_Self_Occupancy Payment_Status_Default
## C12116                                      1                      1
## C12117                                      1                      1
## C12118                                      1                      0
## C12119                                      1                      0
## C12120                                      1                      0
## C12121                                      1                      0
##        Payment_Status_Non.Payoff.Non.Default Payment_Status_Payoff
## C12116                                     0                     0
## C12117                                     0                     0
## C12118                                     1                     0
## C12119                                     1                     0
## C12120                                     1                     0
## C12121                                     1                     0
  • We can visualise the clusters by plotting the data using the fviz_cluster() function which plots the points on the first two principal components
fviz_cluster(km_clust, input_data, labelsize = 7)

K-Mediods Clustering

library(cluster)
## Warning: package 'cluster' was built under R version 3.5.2
pamx <- pam(input_data, 3, )

# pamx
  • We can visualise the clusters by plotting the data using the fviz_cluster() function which plots the points on the first two principal components
fviz_cluster(pamx, labelsize = 7)

  • Conclusion: In this case, K-Mediods did not give as good cluster segmentation as kmeans